1 Plot One Variable - X: Continuous or Discrete

For one continuous variable = Numeric:

  • geom_area()
  • geom_density()
  • geom_histogram()
  • geom_freqpoly()
  • geom_dotplot()
  • stat_ecdf()
  • stat_qq()

For one discrete variable = Factor:

  • geom_bar()
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1.1 Area Plots

alpha, color, fill, linetype, size

# Simulate 400 weights: 200 for sex "F" (mean 55) and 200 for sex "M" (mean 58).
set.seed(1234) # fixed seed so the simulated sample is reproducible
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
wdata <- tibble::as_tibble(data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
))
wdata
## # A tibble: 400 × 2
##       sex   weight
##    <fctr>    <dbl>
## 1       F 53.79293
## 2       F 55.27743
## 3       F 56.08444
## 4       F 52.65430
## 5       F 55.42912
## 6       F 55.50606
## 7       F 54.42526
## 8       F 54.45337
## 9       F 54.43555
## 10      F 54.10996
## # ... with 390 more rows
# Mean weight per sex (column grp.mean); one row per level of `sex`.
mu <- wdata %>%
  group_by(sex) %>%
  summarize(grp.mean = mean(weight))
mu
## # A tibble: 2 × 2
##      sex grp.mean
##   <fctr>    <dbl>
## 1      F 54.94224
## 2      M 58.07325
# Base plot object with weight on the x axis; layers are added to `a`.
a <- ggplot(wdata, aes(x = weight))

a +
  geom_area(stat = "bin", color = "black", fill = "#00AFBB")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# A bare `a + geom_area()` does not work (object 'y' not found);
# stat = "bin" is what supplies the per-bin count as y.
# By default the y axis therefore shows the count of weight values.
# To put the density on the y axis instead, map y to ..density..:
a +
  geom_area(aes(y = ..density..), stat = "bin")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Load the diamonds dataset and keep it as a tibble for compact printing.
data("diamonds")
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
diamonds <- tibble::as_tibble(diamonds)
diamonds
## # A tibble: 53,940 × 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2   0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3   0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4   0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5   0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
## 9   0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows
# Price on the x axis, fill mapped to cut.
p <- ggplot(diamonds, aes(x = price, fill = cut))

# Bar plot of the binned counts
p +
  geom_bar(stat = "bin")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Area plot of the binned counts
p +
  geom_area(stat = "bin")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

1.2 Density Plots

alpha, color, fill, linetype, size

  • scale_color_manual(), scale_fill_manual()
  • scale_color_brewer(), scale_fill_brewer() (RColorBrewer palettes)
  • scale_color_grey(), scale_fill_grey()
# Basic density plot
a +
  geom_density()

# Gray fill with a dashed line at the overall mean and a
# dot-dash line (linetype 4) at the overall median
a +
  geom_density(color = "black", fill = "gray") +
  geom_vline(
    aes(xintercept = mean(weight)),
    color = "#FC4E08", linetype = "dashed", size = 1
  ) +
  geom_vline(
    aes(xintercept = median(weight)),
    color = "blue", linetype = 4, size = 1
  )

# Fill by group (sex), semi-transparent so the curves stay visible
a +
  geom_density(aes(fill = sex), alpha = 0.4)

# Same, plus one dashed mean line per sex taken from `mu`
a +
  geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(
    data = mu,
    aes(xintercept = grp.mean, color = sex),
    linetype = "dashed"
  )

# Manual color control
# a2: density outlines colored by sex, with per-sex mean lines
a2 <- a +
  geom_density(aes(color = sex)) +
  geom_vline(
    data = mu,
    aes(xintercept = grp.mean, color = sex),
    linetype = "dashed"
  ) +
  theme_minimal()

# Three alternative line color scales applied to a2
a2 +
  scale_color_manual(values = c("#999999", "#E69F00"))

a2 +
  scale_color_brewer(palette = "Paired")

a2 +
  scale_color_grey()

# a3: densities filled by sex
a3 <- a +
  geom_density(aes(fill = sex), alpha = 0.4) +
  theme_minimal()

# Three alternative fill scales applied to a3
a3 +
  scale_fill_manual(values = c("#999999", "#E69F00"))

a3 +
  scale_fill_brewer(palette = "Dark2")

a3 +
  scale_fill_grey()

1.3 Histogram Plots

Position adjustments: identity (position_identity()), stack (position_stack()), dodge (position_dodge()). The default is “stack”.

alpha, color, fill, linetype, size

# Basic histogram (default binning)
a +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a +
  geom_histogram(bins = 50)

# Note: by default stat_bin uses 30 bins, which might not be a good default.
# Change the number of bins (e.g. bins = 50) or the bin width
# (e.g. binwidth = 0.5).
a +
  geom_histogram(bins = 50, color = "black", fill = "grey") +
  geom_vline(
    aes(xintercept = mean(weight)),
    color = "#FC4E07", linetype = "dashed", size = 1
  ) +
  theme_minimal()

# Density instead of count on the y axis
a +
  geom_histogram(aes(y = ..density..), bins = 50)

# Outline color by sex
a +
  geom_histogram(aes(color = sex), fill = "white", bins = 50) +
  theme_minimal()

# Position adjustment "identity" (overlaid)
a +
  geom_histogram(
    aes(color = sex),
    fill = "white", bins = 50, alpha = 0.6, position = "identity"
  )

# Position adjustment "dodge" (interleaved)
# Add mean lines and color by sex.
# The per-sex means come from `mu` (grp.mean by sex); mapping xintercept to
# mean(weight) on the full data would draw only one uncolored overall mean.
a +
  geom_histogram(
    aes(color = sex),
    fill = "white", alpha = 0.6, position = "dodge", bins = 50
  ) +
  geom_vline(
    data = mu,
    aes(xintercept = grp.mean, color = sex),
    linetype = "dashed"
  )

# Manual fill and outline colors
# Outline color only
a +
  geom_histogram(
    aes(color = sex),
    fill = "white", alpha = 0.4, position = "identity", bins = 50
  ) +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))

# Fill and outline color together.
# The following attempt is wrong: fill is a constant there, so fill must
# first be mapped to the group before scale_fill_manual() can act on it.
# a + geom_histogram(aes(color = sex), fill = "white", alpha =0.4, position = "identity", bins = 50) + scale_fill_manual(values = c("#00AFBB", "#E7B800")) + scale_color_manual(values = c("#00AFBB", "#E7B800"))

a +
  geom_histogram(
    aes(color = sex, fill = sex),
    alpha = 0.4, position = "identity", bins = 50
  ) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))

## Combine Histogram and Density Plots

# Put density (not count) on the histogram's y axis so that a density
# curve can be overlaid on the same scale; the curve is semi-transparent.

# Histogram with density curve
a +
  geom_histogram(aes(y = ..density..), color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Colored by group
a +
  geom_histogram(
    aes(y = ..density.., color = sex, fill = sex),
    alpha = 0.4, position = "identity"
  ) +
  geom_density(aes(color = sex), size = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

1.4 Frequency Polygon

Very close to histogram plots

  • Histogram use bars
  • Frequency polygons use lines.

alpha, color, linetype, size

# Basic frequency polygon
a +
  geom_freqpoly(bins = 30) +
  theme_minimal()

# Color and linetype by sex, with a custom color palette
a +
  geom_freqpoly(aes(color = sex, linetype = sex), bins = 30) +
  scale_color_manual(values = c("#999999", "#E69F00")) +
  theme_minimal()

# Same, with density on the y axis
a +
  geom_freqpoly(
    aes(y = ..density.., color = sex, linetype = sex),
    bins = 30
  ) +
  scale_color_manual(values = c("#999999", "#E69F00")) +
  theme_minimal()

1.5 Dot Plots for One Variable

Not suitable for one variable, it’s ugly.

# Dot plot of weight with dots filled by sex
a +
  geom_dotplot(aes(fill = sex))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

1.6 ECDF Plots

Empirical Cumulative Distribution Function

alpha, color, linetype, size

# ECDF of weight drawn as points
a +
  stat_ecdf(geom = "point")

# ECDF of weight drawn as a step function
a +
  stat_ecdf(geom = "step")

1.7 QQ Plots

Quantile-Quantile plots are used to check whether given data follow a normal distribution.

alpha, color, shape, size

# Load mtcars and keep it as a tibble for compact printing.
data(mtcars)
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
mtcars <- tibble::as_tibble(mtcars)
mtcars
## # A tibble: 32 × 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
## *  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 2   21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 3   22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 4   21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
## 5   18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
## 6   18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
## 7   14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4
## 8   24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 9   22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4
## # ... with 22 more rows
# Treat the number of cylinders as a factor so it can drive
# discrete aesthetics (shape, color) in the plots.
mtcars <- mtcars %>%
  mutate(cyl = as.factor(cyl))
mtcars
## # A tibble: 32 × 11
##      mpg    cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   21.0      6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 2   21.0      6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 3   22.8      4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 4   21.4      6 258.0   110  3.08 3.215 19.44     1     0     3     1
## 5   18.7      8 360.0   175  3.15 3.440 17.02     0     0     3     2
## 6   18.1      6 225.0   105  2.76 3.460 20.22     1     0     3     1
## 7   14.3      8 360.0   245  3.21 3.570 15.84     0     0     3     4
## 8   24.4      4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 9   22.8      4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 10  19.2      6 167.6   123  3.92 3.440 18.30     1     0     4     4
## # ... with 22 more rows
# QQ plots use the `sample` aesthetic rather than x/y.
p <- ggplot(mtcars, aes(sample = mpg))

# Basic QQ plot
p +
  stat_qq()

# Point shape and color by cyl, with a custom color palette
p +
  stat_qq(aes(shape = cyl, color = cyl)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))

1.8 Bar Plots of Counts

For one discrete variable

alpha, color, fill, linetype, size

# Load the mpg dataset and keep it as a tibble for compact printing.
data(mpg)
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
mpg <- tibble::as_tibble(mpg)
mpg
## # A tibble: 234 × 11
##    manufacturer      model displ  year   cyl      trans   drv   cty   hwy
##           <chr>      <chr> <dbl> <int> <int>      <chr> <chr> <int> <int>
## 1          audi         a4   1.8  1999     4   auto(l5)     f    18    29
## 2          audi         a4   1.8  1999     4 manual(m5)     f    21    29
## 3          audi         a4   2.0  2008     4 manual(m6)     f    20    31
## 4          audi         a4   2.0  2008     4   auto(av)     f    21    30
## 5          audi         a4   2.8  1999     6   auto(l5)     f    16    26
## 6          audi         a4   2.8  1999     6 manual(m5)     f    18    26
## 7          audi         a4   3.1  2008     6   auto(av)     f    18    27
## 8          audi a4 quattro   1.8  1999     4 manual(m5)     4    18    26
## 9          audi a4 quattro   1.8  1999     4   auto(l5)     4    16    25
## 10         audi a4 quattro   2.0  2008     4 manual(m6)     4    20    28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>
# Bar plot of counts for each value of `fl`
ggplot(mpg, aes(fl)) +
  geom_bar(fill = "steelblue") +
  theme_minimal()

2 Plot Two Variables -X & Y: Both Continuous or Discrete

2.1 Scatter plots: Continuous X and Y

  • geom_point()
  • geom_smooth()
  • geom_quantile()
  • geom_rug()
  • geom_jitter()
  • geom_text()

geom_point
alpha, color, fill, shape, size

# Data format: print the mtcars tibble to inspect the columns plotted below
mtcars
## # A tibble: 32 × 11
##      mpg    cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   21.0      6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 2   21.0      6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 3   22.8      4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 4   21.4      6 258.0   110  3.08 3.215 19.44     1     0     3     1
## 5   18.7      8 360.0   175  3.15 3.440 17.02     0     0     3     2
## 6   18.1      6 225.0   105  2.76 3.460 20.22     1     0     3     1
## 7   14.3      8 360.0   245  3.21 3.570 15.84     0     0     3     4
## 8   24.4      4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 9   22.8      4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 10  19.2      6 167.6   123  3.92 3.440 18.30     1     0     4     4
## # ... with 22 more rows
# Base scatter plot object:
#   x = wt  (weight)
#   y = mpg (miles/gallon)
b <- ggplot(mtcars, aes(x = wt, y = mpg))

# Basic scatter plot
b +
  geom_point(color = "#00AFBB")

# Change the point size and shape
b +
  geom_point(color = "#00AFBB", size = 2, shape = 23)

# Point size driven by a continuous variable: qsec (1/4 mile time)
b +
  geom_point(aes(size = qsec), color = "#00AFBB")

# Label each point with its car name.
# `mtcars` is a tibble at this point, so rownames(mtcars) yields only
# "1".."32". Take the names from the original dataset instead; the earlier
# as_data_frame()/mutate() steps did not reorder rows, so they still line up.
b +
  geom_point() +
  geom_text(label = rownames(datasets::mtcars), nudge_y = 0.8)

# Shape, color and size chosen automatically
# Point shape by the level of cyl
b +
  geom_point(aes(shape = cyl))

# Point shape and color by cyl
b +
  geom_point(aes(color = cyl, shape = cyl))

# Shape, color and size chosen manually
# Point sizes set manually, one per cyl level
b +
  geom_point(aes(color = cyl, shape = cyl, size = cyl)) +
  scale_size_manual(values = c(2, 3, 4))

# Point shapes and colors set manually
b +
  geom_point(aes(color = cyl, shape = cyl)) +
  scale_shape_manual(values = c(3, 16, 17)) +
  scale_color_manual(values = c('#999999', '#E69F00', '#56B4E9'))

# Brewer color palette
b +
  geom_point(aes(color = cyl, shape = cyl)) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

# Grey scale
b +
  geom_point(aes(color = cyl, shape = cyl)) +
  scale_color_grey() +
  theme_minimal()

#######################################################
## Add regression line or smoothed conditional mean ###
#######################################################
# geom_smooth(), geom_abline()
# alpha, color, fill, shape, linetype, size
# geom_smooth(method = "auto")
# method: loess -> local regression, lm -> linear regression

# Points plus a linear regression line
b +
  geom_point() +
  geom_smooth(method = lm)

# Points plus regression line, without the confidence interval
b +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)

# Default smoother (loess, local regression fitting)
b +
  geom_point() +
  geom_smooth()

# Color and shape by group
b +
  geom_point(aes(color = cyl, shape = cyl)) +
  geom_smooth(aes(color = cyl, fill = cyl), method = lm)

# Remove the confidence intervals and
# extend the regression lines with fullrange = TRUE
b +
  geom_point(aes(color = cyl, shape = cyl)) +
  geom_smooth(
    aes(color = cyl),
    method = lm, se = FALSE, fullrange = TRUE
  )

# Marginal rugs on a scatter plot
# geom_rug(sides = "bl")
# sides: a string made of "t", "r", "b", "l" (top, right, bottom, left)

# Add marginal rugs
b +
  geom_point() +
  geom_rug()

# Color points and rugs by group
b +
  geom_point(aes(color = cyl)) +
  geom_rug(aes(color = cyl))

# Add marginal rugs using the faithful data
data(faithful)
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
faithful <- tibble::as_tibble(faithful)
faithful
## # A tibble: 272 × 2
##    eruptions waiting
## *      <dbl>   <dbl>
## 1      3.600      79
## 2      1.800      54
## 3      3.333      74
## 4      2.283      62
## 5      4.533      85
## 6      2.883      55
## 7      4.700      88
## 8      3.600      85
## 9      1.950      51
## 10     4.350      85
## # ... with 262 more rows
# Scatter plot of waiting against eruptions, with marginal rugs
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_rug()

# Jitter points to reduce overplotting
# geom_jitter(), position_jitter()
# alpha, color, fill, shape, size

# Uses the mpg data: displ on x, hwy on y
p <- ggplot(mpg, aes(displ, hwy))

# Default scatter plot
p +
  geom_point()

# Jittered by up to 0.5 in each direction to reduce overplotting
p +
  geom_jitter(position = position_jitter(width = 0.5, height = 0.5))

# Rows with displ == 1.9, ordered by decreasing hwy
mpg %>%
  select(displ, hwy) %>%
  arrange(desc(hwy)) %>%
  filter(displ == 1.9)
## # A tibble: 3 × 2
##   displ   hwy
##   <dbl> <int>
## 1   1.9    44
## 2   1.9    44
## 3   1.9    41
##
# Text annotation
# geom_text()
# label, alpha, angle, color, family, fontface, hjust, lineheight, size, vjust

# `mtcars` is a tibble here, so rownames(mtcars) would give only "1".."32".
# Read the car names from the original dataset; the row order is unchanged.
b +
  geom_text(aes(label = rownames(datasets::mtcars)), size = 3)

2.2 Continuous bivariate distribution

  • geom_bin2d()
  • geom_hex()
  • geom_density_2d()
# Base plot: carat on x, price on y.
# Note: this variable is named `c`; calls such as c(1, 1000) still work
# because R skips non-function objects when looking up a function name.
c <- ggplot(diamonds, aes(carat, price))

# Heatmap of 2d bin counts
# geom_bin2d() draws rectangular bins in place of individual points.
# stat_bin_2d(), stat_summary_2d()
# xmax, xmin, ymax, ymin, alpha, color, fill, linetype, size
c +
  geom_bin2d()

# Change the number of bins
c +
  geom_bin2d(bins = 15)

# Specify the bin widths (x, y)
c +
  geom_bin2d(binwidth = c(1, 1000))

c +
  stat_bin_2d()

c +
  stat_summary_2d(aes(z = depth))

# Add hexagonal binning
# geom_hex()
# stat_bin_hex(), stat_summary_hex()
# alpha, color, fill, size

# Load hexbin before the hex layers below. library() stops with an error
# when the package is missing, whereas require() only warns and returns FALSE,
# which would let the script continue and fail later.
library(hexbin)
c + geom_hex()

# Change the number of bins
c + geom_hex(bins = 10)

c + stat_bin_hex()

c + stat_summary_hex(aes(z = depth))

# 2D density estimation
# geom_density_2d()
# stat_density_2d()
# alpha, color, linetype, size

# Base scatter plot object for the faithful data
sp <- ggplot(faithful, aes(x = eruptions, y = waiting))

# Show the two columns being plotted
faithful %>%
  select(eruptions, waiting)
## # A tibble: 272 × 2
##    eruptions waiting
## *      <dbl>   <dbl>
## 1      3.600      79
## 2      1.800      54
## 3      3.333      74
## 4      2.283      62
## 5      4.533      85
## 6      2.883      55
## 7      4.700      88
## 8      3.600      85
## 9      1.950      51
## 10     4.350      85
## # ... with 262 more rows
# Default 2D density contours
sp +
  geom_density_2d(color = "#E7B800")

# Contours drawn over the points
sp +
  geom_point(color = "#00AFBB") +
  geom_density_2d(color = "#E7B800")

# stat_density_2d with geom = "polygon", filled by density level
sp +
  geom_point() +
  stat_density_2d(aes(fill = ..level..), geom = "polygon")

# Same, with a custom fill gradient
sp +
  geom_point() +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  scale_fill_gradient(low = "#00AFBB", high = "#FC3E07")

# Gradient

2.3 Two variables: Discrete X, Discrete Y

geom_jitter
alpha, color, fill, shape, size

# Jittered points for two discrete variables (cut vs color), colored by cut
ggplot(diamonds, aes(cut, color)) +
  geom_jitter(aes(color = cut), size = 0.5)

# Show the two columns being plotted
diamonds %>%
  select(cut, color)
## # A tibble: 53,940 × 2
##          cut color
##        <ord> <ord>
## 1      Ideal     E
## 2    Premium     E
## 3       Good     E
## 4    Premium     I
## 5       Good     J
## 6  Very Good     J
## 7  Very Good     I
## 8  Very Good     H
## 9       Fair     E
## 10 Very Good     H
## # ... with 53,930 more rows

3 Plot Two Variables - X & Y: Discrete X and Continuous Y

  • geom_boxplot()
  • geom_violin()
  • geom_dotplot()
  • geom_jitter()
  • geom_line()
  • geom_bar()
# Load ToothGrowth, treat dose as a factor (discrete x axis), keep as a tibble.
data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
# tibble::as_tibble() replaces the deprecated as_data_frame() alias.
ToothGrowth <- tibble::as_tibble(ToothGrowth)
ToothGrowth
## # A tibble: 60 × 3
##      len   supp   dose
##    <dbl> <fctr> <fctr>
## 1    4.2     VC    0.5
## 2   11.5     VC    0.5
## 3    7.3     VC    0.5
## 4    5.8     VC    0.5
## 5    6.4     VC    0.5
## 6   10.0     VC    0.5
## 7   11.2     VC    0.5
## 8   11.2     VC    0.5
## 9    5.2     VC    0.5
## 10   7.0     VC    0.5
## # ... with 50 more rows
# Base plot object: dose (factor) on x, len on y
e <- ggplot(
  ToothGrowth,
  aes(x = dose, y = len)
)

3.1 Box Plots

alpha, color, linetype, shape, size, fill

# Basic box plot
e +
  geom_boxplot()

# Rotated (horizontal) box plot
e +
  geom_boxplot() +
  coord_flip()

# Notched box plot
e +
  geom_boxplot(notch = TRUE)

# Box plot with the group means overlaid as blue points (shape 18)
e +
  geom_boxplot() +
  stat_summary(
    fun.y = mean, geom = "point",
    shape = 18, size = 4, color = "blue"
  )